import numpy as np
import torch
#import matplotlib.pyplot as plt
#import statistics
import time
import argparse
from torch.utils.data import DataLoader, TensorDataset, random_split
#from scipy.stats import pearsonr
import copy
from GenerateBeta import GenerateBeta
from EmbDataSet import EmbDataSet
from GD import GD
from SGD import SGD
from SGDwReg import SGDwReg
from SGDwOrigiReg import SGDwOrigiReg
from CVSGD import CVSGD
from ANTICVSGD import ANTICVSGD

# ---------------------------------------------------------------------------
# Command-line interface.  Every flag is mandatory; they are registered in a
# fixed order so the generated usage/help text stays the same.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
for _flag, _caster in (
        ("--eps", int),
        ("--SGDwReg_eps", int),
        ("--SGDwOrigiReg_eps", int),
        ("--lr", float),
        ("--lambda1", float),
        ("--lambda2", float),
        ("--lambda3", float),
):
    parser.add_argument(_flag, type=_caster, required=True)
del _flag, _caster  # keep the module namespace free of loop leftovers
args = parser.parse_args()

# Echo the parsed configuration so it ends up in the run log.
print(args)
print(args.eps)

# Unpack the CLI values into the module-level names the rest of the script reads.
eps, SGDwReg_eps, SGDwOrigiReg_eps = args.eps, args.SGDwReg_eps, args.SGDwOrigiReg_eps
lr = args.lr  # forwarded to every optimiser as `learningrate`
lambda1, lambda2 = args.lambda1, args.lambda2  # forwarded to SGDwReg
lambda3 = args.lambda3  # forwarded to SGDwOrigiReg

# Sizes of the experiment sweep.
NumExp = 3   # number of independently initialised models (outer loop)
NumS = 4     # number of beta / dataset seeds per model
NumSet = 1   # number of one-sample-perturbed datasets per original dataset
NumRun = 2   # number of SGD seeds per dataset

# Optimiser settings.
bs = 1        # forwarded as `bs`; also used to size the SGD curves (eps * N_train / bs)
eva_bs = 5    # not referenced in the visible part of the script
K = 40        # forwarded to the optimisers as `K`
KBS = 40      # forwarded to the optimisers as `KBS`
EpTimes = 40  # GD curves have eps * EpTimes entries
NumCan = 4    # only referenced by the disabled CVSGD/ANTICVSGD section

# Data-generation settings.
d = 100            # forwarded to GenerateBeta / EmbDataSet and used as model input size
emb = 100          # forwarded to EmbDataSet as `emb`
nz = 5             # forwarded to GenerateBeta as `nz`
N = 160            # forwarded to EmbDataSet as `N`
TrainRatio = 0.25  # N_train = int(N * TrainRatio)
Replacement = False  # forwarded to the optimisers as `Replacement`
RandomBeta = True    # forwarded to GenerateBeta as `Random`

# Plotting settings (the plotting section is currently disabled).
PlotGraphs = True
Colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

if __name__ == '__main__':
    print('start')
    starting_time = time.time()

    # Seed both RNGs, then draw the seed tables.  The order of the three
    # randint calls is significant: it determines which seeds each table gets.
    torch.manual_seed(1234567)
    np.random.seed(1234567)
    ManualSeeds = np.random.randint(100, size=NumRun)
    BetaSeeds = np.random.randint(100, size=NumS)
    DataSetSeeds = np.random.randint(100, size=NumS * 2)
    criterion = torch.nn.MSELoss()

    def _curves(length):
        """Return one fresh zero-filled array of `length` entries per experiment."""
        return [np.zeros(length) for _ in range(NumExp)]

    def _scalars():
        """Return one fresh 1x1 zero tensor per experiment."""
        return [torch.zeros(1, 1) for _ in range(NumExp)]

    N_train = int(N * TrainRatio)

    # ---- Full-batch gradient descent -------------------------------------
    GDites = int(eps * EpTimes)
    GDSecondMoment = _curves(GDites)
    GDFirstMoment = _curves(GDites)
    GDTrainLossAve = _curves(GDites)
    GDTestLossAve = _curves(GDites)
    GDGradientVarianceAve = _curves(GDites)
    GDAccGradientVarianceAve = _curves(GDites)
    GDGradientNormAve = _curves(GDites)
    GDBootstrapLossAve = _curves(GDites)
    GDProductsAve = _curves(GDites)
    GDHessianFrobeniusesAve = _curves(GDites)
    GDHessianTracesAve = _curves(GDites)
    GDCovarianceTracesAve = _curves(GDites)
    GDAccProductsAve = _curves(GDites)
    GDSecondOrderError = _scalars()
    GDTrueSecondOrderError = _scalars()
    GDOneSampleDifference = _scalars()

    # ---- Plain SGD ----------------------------------------------------------
    SGDites = int(eps * N_train / bs)
    SGDSecondMoment = _curves(SGDites)
    SGDFirstMoment = _curves(SGDites)
    SGDTrainLossAve = _curves(SGDites)
    SGDTestLossAve = _curves(SGDites)
    SGDGradientVarianceAve = _curves(SGDites)
    SGDAccGradientVarianceAve = _curves(SGDites)
    SGDGradientNormAve = _curves(SGDites)
    SGDBootstrapLossAve = _curves(SGDites)
    SGDProductsAve = _curves(SGDites)
    SGDHessianFrobeniusesAve = _curves(SGDites)
    SGDHessianTracesAve = _curves(SGDites)
    SGDCovarianceTracesAve = _curves(SGDites)
    SGDAccProductsAve = _curves(SGDites)
    SGDSecondOrderError = _curves(SGDites)

    # ---- SGD with the lambda1/lambda2 regulariser ---------------------------
    SGDwRegites = int(SGDwReg_eps * N_train / bs)
    SGDwRegSecondMoment = _curves(SGDwRegites)
    SGDwRegFirstMoment = _curves(SGDwRegites)
    SGDwRegTrainLossAve = _curves(SGDwRegites)
    SGDwRegTestLossAve = _curves(SGDwRegites)
    SGDwRegGradientVarianceAve = _curves(SGDwRegites)
    SGDwRegAccGradientVarianceAve = _curves(SGDwRegites)
    SGDwRegGradientNormAve = _curves(SGDwRegites)
    SGDwRegBootstrapLossAve = _curves(SGDwRegites)
    SGDwRegProductsAve = _curves(SGDwRegites)
    SGDwRegHessianFrobeniusesAve = _curves(SGDwRegites)
    SGDwRegHessianTracesAve = _curves(SGDwRegites)
    SGDwRegCovarianceTracesAve = _curves(SGDwRegites)
    SGDwRegAccProductsAve = _curves(SGDwRegites)
    SGDwRegSecondOrderError = _curves(SGDwRegites)

    # ---- SGD with the lambda3 regulariser -----------------------------------
    SGDwOrigiRegites = int(SGDwOrigiReg_eps * N_train / bs)
    SGDwOrigiRegSecondMoment = _curves(SGDwOrigiRegites)
    SGDwOrigiRegFirstMoment = _curves(SGDwOrigiRegites)
    SGDwOrigiRegTrainLossAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegTestLossAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegGradientVarianceAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegAccGradientVarianceAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegGradientNormAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegBootstrapLossAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegProductsAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegHessianFrobeniusesAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegHessianTracesAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegCovarianceTracesAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegAccProductsAve = _curves(SGDwOrigiRegites)
    SGDwOrigiRegSecondOrderError = _curves(SGDwOrigiRegites)
    '''
    CVSGDites = int(eps * N_train / bs)# / NumCan)
    CVSGDSecondMoment = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDFirstMoment = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDTrainLossAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDTestLossAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDGradientVarianceAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDAccGradientVarianceAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDBootstrapLossAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDProductsAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDHessianFrobeniusesAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDHessianTracesAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    CVSGDCovarianceTracesAve = [np.zeros(CVSGDites) for x in range(NumExp)]
    ANTICVSGDites = int(eps * N_train / bs)# / NumCan)
    ANTICVSGDSecondMoment = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDFirstMoment = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDTrainLossAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDTestLossAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDGradientVarianceAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDAccGradientVarianceAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDBootstrapLossAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDProductsAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDHessianFrobeniusesAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDHessianTracesAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    ANTICVSGDCovarianceTracesAve = [np.zeros(ANTICVSGDites) for x in range(NumExp)]
    '''
    # Initialize the diagonal linear models
    class DiagonalLinear(torch.nn.Module):
        """Bias-free linear model whose effective weight is ``u**2 - v**2``.

        ``u`` is ``self.linear.weight`` and ``v`` is ``self.linearminus.weight``;
        both are Xavier-uniform initialised and squared element-wise in the
        forward pass.
        """

        def __init__(self, inputSize, outputSize):
            super().__init__()
            # Both layers are created before either is re-initialised: this
            # fixes the parameter order and the order in which the RNG is used.
            self.linear = torch.nn.Linear(inputSize, outputSize, bias=False)
            self.linearminus = torch.nn.Linear(inputSize, outputSize, bias=False)
            for branch in (self.linear, self.linearminus):
                torch.nn.init.xavier_uniform_(branch.weight)

        def forward(self, x):
            """Return ``x @ (u**2 - v**2).T`` for the input batch ``x``."""
            plus_part = torch.square(self.linear.weight).T
            minus_part = torch.square(self.linearminus.weight).T
            return x @ (plus_part - minus_part)

    # One freshly initialised diagonal model (d inputs, 1 output) per experiment.
    DiaModels = [DiagonalLinear(d, 1) for _ in range(NumExp)]

    #Initialize the bias-free fully connected ReLU network (depth is set by num_layers)
    class ReLUNet(torch.nn.Module):
        """Bias-free fully connected network with ReLU between linear layers.

        Layer ``i`` maps ``input_sizes[i]`` features to ``input_sizes[i + 1]``
        features; every weight matrix is Kaiming-uniform initialised.  No
        activation is applied after the final layer.
        """

        def __init__(self, num_layers, input_sizes):
            super().__init__()
            self.num_layers = num_layers
            self.layers = torch.nn.ModuleList()
            for idx in range(num_layers):
                fan_in, fan_out = input_sizes[idx], input_sizes[idx + 1]
                # Create, initialise, then register - one layer at a time, so
                # the RNG is consumed in the same per-layer order as before.
                dense = torch.nn.Linear(fan_in, fan_out, bias=False)
                torch.nn.init.kaiming_uniform_(dense.weight)
                self.layers.append(dense)
            self.relu = torch.nn.ReLU()

        def forward(self, x):
            """Apply every layer in order, with ReLU after all but the last."""
            final = self.num_layers - 1
            for idx in range(final):
                x = self.relu(self.layers[idx](x))
            return self.layers[final](x)

    # Architecture of the ReLU networks: d -> 5 -> 2 -> 1 (three linear layers).
    Num_Layers = 3
    Input_Sizes = [d, 5, 2, 1]
    ReLUModels = [
        ReLUNet(num_layers=Num_Layers, input_sizes=Input_Sizes)
        for _ in range(NumExp)
    ]

    # Model family used by the experiments below.
    Models = DiaModels
    # Main sweep.  For each initial model `exp` and each dataset seed `s`:
    #   1. train GD / SGD / SGDwReg / SGDwOrigiReg on the original dataset and
    #      accumulate their per-iteration curves;
    #   2. build NumSet perturbed datasets (one training sample replaced), retrain
    #      on each, and accumulate the norms of the weight-trajectory differences.
    # After the `s` loop the accumulated sums are turned into averages and the
    # gradient-variance curves into running sums.
    # NOTE(review): every optimiser call below receives the same Models[exp]
    # object as `ini_model`; whether the callee copies it before training is not
    # visible here - confirm, otherwise later runs start from trained weights.
    for exp in range(NumExp):
        for s in range(NumS):
            beta = GenerateBeta(d=d, nz=nz, seed=BetaSeeds[s], Random=RandomBeta)
            # Two draws with the same beta but different seeds: the second one only
            # supplies replacement samples for the perturbed datasets.
            # NOTE(review): this rebinds `d` and `N_train`; the accumulator lengths
            # were computed from the earlier N_train - confirm the values agree.
            x_train, y_train, x_test, y_test, d, N_train, N_test = EmbDataSet(beta=beta, RandomSeed=DataSetSeeds[s * 2], d=d, emb=emb,
                                                                              N=N, TrainRatio=TrainRatio)
            x_train_alt, y_train_alt, _, _, _, _, _ = EmbDataSet(beta=beta, RandomSeed=DataSetSeeds[s * 2 + 1], d=d, emb=emb, N=N, TrainRatio=TrainRatio)
            # GD on the original dataset
            GDModel, GDWTs, GDTrainLosses, GDTestLosses, GDGradientVariances, GDGradientNorms, GDBootstrapLoss, GDProducts, GDHessianFrobeniuses, GDHessianTraces, GDCovarianceTraces, GDAccProducts, GDHessian = GD(
                ini_model=Models[exp],
                x_train=x_train, y_train=y_train,
                x_test=x_test,
                y_test=y_test,
                d=d, N_train=N_train, eps=eps, bs=bs,
                learningrate=lr,
                Replacement=Replacement,
                seed=ManualSeeds[0], K=K, KBS=KBS,
                EpTimes=EpTimes, ComputeGV=True, ComputeBL=True, ComputeSingleHessian=False, z_i=x_train[0], z_i_label=y_train[0])
            # Sum the GD curves over datasets; they are divided by NumS further down.
            GDTrainLossAve[exp] += GDTrainLosses
            GDTestLossAve[exp] += GDTestLosses
            GDGradientVarianceAve[exp] += GDGradientVariances
            GDGradientNormAve[exp] += GDGradientNorms
            GDBootstrapLossAve[exp] += GDBootstrapLoss
            GDProductsAve[exp] += GDProducts
            GDHessianFrobeniusesAve[exp] += GDHessianFrobeniuses
            GDHessianTracesAve[exp] += GDHessianTraces
            GDCovarianceTracesAve[exp] += GDCovarianceTraces
            GDAccProductsAve[exp] += GDAccProducts
            # SGD on the original dataset
            # One run per seed; the weight trajectories are kept so they can be
            # compared against the runs on the perturbed datasets below.
            SGDWTSet = []
            for run in range(NumRun):
                SGDWTs, SGDTrainLosses, SGDTestLosses, SGDGradientVariances, SGDGradientNorms, SGDBootstrapLoss, SGDProducts, SGDHessianFrobeniuses, SGDHessianTraces, SGDCovarianceTraces, SGDAccProducts, SGDHessian = SGD(
                    ini_model=Models[exp],
                    x_train=x_train, y_train=y_train, x_test=x_test,
                    y_test=y_test,
                    d=d, N_train=N_train, eps=eps, bs=bs, learningrate=lr,
                    Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS,
                    ComputeGV=True, ComputeBL=True)
                SGDTrainLossAve[exp] += SGDTrainLosses
                SGDTestLossAve[exp] += SGDTestLosses
                SGDGradientVarianceAve[exp] += SGDGradientVariances
                SGDGradientNormAve[exp] += SGDGradientNorms
                SGDBootstrapLossAve[exp] += SGDBootstrapLoss
                SGDProductsAve[exp] += SGDProducts
                SGDHessianFrobeniusesAve[exp] += SGDHessianFrobeniuses
                SGDHessianTracesAve[exp] += SGDHessianTraces
                SGDCovarianceTracesAve[exp] += SGDCovarianceTraces
                SGDAccProductsAve[exp] += SGDAccProducts
                SGDWTSet.append(SGDWTs)
            # SGDwReg on the original dataset
            # Same protocol as plain SGD, with its own epoch count and lambda1/lambda2.
            SGDwRegWTSet = []
            for run in range(NumRun):
                SGDwRegWTs, SGDwRegTrainLosses, SGDwRegTestLosses, SGDwRegGradientVariances, SGDwRegGradientNorms, SGDwRegBootstrapLoss, SGDwRegProducts, SGDwRegHessianFrobeniuses, SGDwRegHessianTraces, SGDwRegCovarianceTraces, SGDwRegAccProducts, SGDwRegHessian = SGDwReg(
                    ini_model=Models[exp],
                    x_train=x_train, y_train=y_train, x_test=x_test,
                    y_test=y_test,
                    d=d, N_train=N_train, eps=SGDwReg_eps, bs=bs, learningrate=lr,
                    Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS,
                    ComputeGV=True, ComputeBL=True, lambda1=lambda1, lambda2=lambda2)
                SGDwRegTrainLossAve[exp] += SGDwRegTrainLosses
                SGDwRegTestLossAve[exp] += SGDwRegTestLosses
                SGDwRegGradientVarianceAve[exp] += SGDwRegGradientVariances
                SGDwRegGradientNormAve[exp] += SGDwRegGradientNorms
                SGDwRegBootstrapLossAve[exp] += SGDwRegBootstrapLoss
                SGDwRegProductsAve[exp] += SGDwRegProducts
                SGDwRegHessianFrobeniusesAve[exp] += SGDwRegHessianFrobeniuses
                SGDwRegHessianTracesAve[exp] += SGDwRegHessianTraces
                SGDwRegCovarianceTracesAve[exp] += SGDwRegCovarianceTraces
                SGDwRegAccProductsAve[exp] += SGDwRegAccProducts
                SGDwRegWTSet.append(SGDwRegWTs)
            # SGDwOrigiReg on the original dataset
            # Same protocol again, with its own epoch count and lambda3.
            SGDwOrigiRegWTSet = []
            for run in range(NumRun):
                SGDwOrigiRegWTs, SGDwOrigiRegTrainLosses, SGDwOrigiRegTestLosses, SGDwOrigiRegGradientVariances, SGDwOrigiRegGradientNorms, SGDwOrigiRegBootstrapLoss, SGDwOrigiRegProducts, SGDwOrigiRegHessianFrobeniuses, SGDwOrigiRegHessianTraces, SGDwOrigiRegCovarianceTraces, SGDwOrigiRegAccProducts, SGDwOrigiRegHessian = SGDwOrigiReg(
                    ini_model=Models[exp],
                    x_train=x_train, y_train=y_train, x_test=x_test,
                    y_test=y_test,
                    d=d, N_train=N_train, eps=SGDwOrigiReg_eps, bs=bs, learningrate=lr,
                    Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS,
                    ComputeGV=True, ComputeBL=True, lambda3=lambda3)
                SGDwOrigiRegTrainLossAve[exp] += SGDwOrigiRegTrainLosses
                SGDwOrigiRegTestLossAve[exp] += SGDwOrigiRegTestLosses
                SGDwOrigiRegGradientVarianceAve[exp] += SGDwOrigiRegGradientVariances
                SGDwOrigiRegGradientNormAve[exp] += SGDwOrigiRegGradientNorms
                SGDwOrigiRegBootstrapLossAve[exp] += SGDwOrigiRegBootstrapLoss
                SGDwOrigiRegProductsAve[exp] += SGDwOrigiRegProducts
                SGDwOrigiRegHessianFrobeniusesAve[exp] += SGDwOrigiRegHessianFrobeniuses
                SGDwOrigiRegHessianTracesAve[exp] += SGDwOrigiRegHessianTraces
                SGDwOrigiRegCovarianceTracesAve[exp] += SGDwOrigiRegCovarianceTraces
                SGDwOrigiRegAccProductsAve[exp] += SGDwOrigiRegAccProducts
                SGDwOrigiRegWTSet.append(SGDwOrigiRegWTs)
            for i in range(NumSet):
                # Construct S(i)
                # The perturbed set is a copy of the training set in which sample 0
                # is replaced by the i-th sample of the alternative draw.
                x_train_i = copy.copy(x_train)
                y_train_i = copy.copy(y_train)
                #x_train_i[i] = x_train_alt[i]
                #y_train_i[i] = y_train_alt[i]
                x_train_i[0] = x_train_alt[i]
                y_train_i[0] = y_train_alt[i]
                # GD on the disturbed datasets
                # Same seed as the unperturbed GD run; only the weight trajectory,
                # the trained model and the Hessian are kept.
                GDModel_is, GDWT_is, _, _, _, _, _, _, _, _, _, _, GDHessian_is = GD(ini_model=Models[exp],
                                      x_train=x_train_i, y_train=y_train_i, x_test=x_test,
                                      y_test=y_test,
                                      d=d, N_train=N_train, eps=eps, bs=bs, learningrate=lr,
                                      Replacement=Replacement, seed=ManualSeeds[0], K=K, KBS=KBS, EpTimes=EpTimes,
                                      ComputeGV=False, ComputeBL=False, ComputeSingleHessian=True, z_i=x_train_i[0], z_i_label=y_train_i[0])
                # Difference between the two GD weight trajectories
                # (presumably one row per iteration - TODO confirm against GD).
                Residuals = GDWTs - GDWT_is
                #print((Residuals[-1].view(1,-1) @ Residuals[-1].view(-1,1)))
                print(Residuals[-1])
                print(Residuals[-1].shape)
                print(torch.norm(Residuals[-1], p=2, dim=0))
                # Quadratic forms r^T H r with r the mean residual over a window of
                # late iterations, once with the perturbed-run Hessian and once with
                # the Hessian returned by the unperturbed run.
                # NOTE(review): the slice [-100:-1] stops before the final row, so it
                # averages 99 rows and skips the last iterate - confirm this is intended.
                # NOTE(review): GDHessian comes from a call made with
                # ComputeSingleHessian=False - confirm what GD returns in that case.
                GDTrueSecondOrderError[exp] += Residuals[-100:-1].mean(axis=0).view(1, -1) @ GDHessian_is @ Residuals[-100:-1].mean(axis=0).view(-1, 1)
                GDSecondOrderError[exp] += Residuals[-100:-1].mean(axis=0).view(1, -1) @ GDHessian @ Residuals[-100:-1].mean(axis=0).view(-1, 1)
                # Compute L(z',A(S)) - L(z',A(S'))
                # i.e. the loss on the replaced sample under the model trained on the
                # original set minus the loss under the model trained on the perturbed set.
                GDOneSampleDifference[exp] += criterion(GDModel(torch.from_numpy(x_train_i[0])), torch.from_numpy(y_train_i[0])).item() - criterion(GDModel_is(torch.from_numpy(x_train_i[0])), torch.from_numpy(y_train_i[0])).item()
                # Per-row L2 norm of the trajectory difference.
                Norms = torch.norm(Residuals, p=2, dim=1)
                # print(Residual)
                for ite in range(GDites):
                    GDFirstMoment[exp][ite] += Norms[ite]
                    GDSecondMoment[exp][ite] += Norms[ite] ** 2
                for run in range(NumRun):
                    # SGD on disturbed datasets
                    # Reuses the seed of the matching unperturbed run so the two
                    # trajectories differ only through the replaced sample.
                    SGDWT_is, _, _, _, _, _, _, _, _, _, _, _ = SGD(ini_model=Models[exp],
                             x_train=x_train_i, y_train=y_train_i, x_test=x_test,
                             y_test=y_test,
                             d=d, N_train=N_train, eps=eps, bs=bs, learningrate=lr,
                             Replacement=Replacement, seed=ManualSeeds[run], K=K, KBS=KBS, ComputeGV=False, ComputeBL=False)
                    Residuals = SGDWTSet[run] - SGDWT_is
                    Norms = torch.norm(Residuals, p=2, dim=1)
                    #print(Residual)
                    for ite in range(SGDites):
                        SGDFirstMoment[exp][ite] += Norms[ite]
                        SGDSecondMoment[exp][ite] += Norms[ite] ** 2
                    # SGDwReg on disturbed datasets
                    SGDwRegWT_is, _, _, _, _, _, _, _, _, _, _, _ = SGDwReg(ini_model=Models[exp],
                                                                    x_train=x_train_i, y_train=y_train_i, x_test=x_test,
                                                                    y_test=y_test,
                                                                    d=d, N_train=N_train, eps=SGDwReg_eps, bs=bs,
                                                                    learningrate=lr,
                                                                    Replacement=Replacement, seed=ManualSeeds[run], K=K,
                                                                    KBS=KBS, ComputeGV=False, ComputeBL=False, lambda1=lambda1, lambda2=lambda2)
                    Residuals = SGDwRegWTSet[run] - SGDwRegWT_is
                    Norms = torch.norm(Residuals, p=2, dim=1)
                    # print(Residual)
                    for ite in range(SGDwRegites):
                        SGDwRegFirstMoment[exp][ite] += Norms[ite]
                        SGDwRegSecondMoment[exp][ite] += Norms[ite] ** 2
                    # SGDwOrigiReg on disturbed datasets
                    SGDwOrigiRegWT_is, _, _, _, _, _, _, _, _, _, _, _ = SGDwOrigiReg(ini_model=Models[exp],
                                                                            x_train=x_train_i, y_train=y_train_i,
                                                                            x_test=x_test,
                                                                            y_test=y_test,
                                                                            d=d, N_train=N_train, eps=SGDwOrigiReg_eps, bs=bs,
                                                                            learningrate=lr,
                                                                            Replacement=Replacement,
                                                                            seed=ManualSeeds[run], K=K,
                                                                            KBS=KBS, ComputeGV=False, ComputeBL=False, lambda3=lambda3)
                    Residuals = SGDwOrigiRegWTSet[run] - SGDwOrigiRegWT_is
                    Norms = torch.norm(Residuals, p=2, dim=1)
                    # print(Residual)
                    for ite in range(SGDwOrigiRegites):
                        SGDwOrigiRegFirstMoment[exp][ite] += Norms[ite]
                        SGDwOrigiRegSecondMoment[exp][ite] += Norms[ite] ** 2

        # Turn the accumulated sums into averages.  GD quantities were added once
        # per dataset (or once per dataset and perturbed set); the stochastic
        # methods additionally once per run.
        GDFirstMoment[exp] /= (NumS * NumSet)
        GDSecondMoment[exp] /= (NumS * NumSet)
        GDTrainLossAve[exp] /= NumS
        GDTestLossAve[exp] /= NumS
        GDGradientVarianceAve[exp] /= NumS
        GDGradientNormAve[exp] /= NumS
        GDBootstrapLossAve[exp] /= NumS
        GDProductsAve[exp] /= NumS
        GDHessianFrobeniusesAve[exp] /= NumS
        GDHessianTracesAve[exp] /= NumS
        GDCovarianceTracesAve[exp] /= NumS
        GDAccProductsAve[exp] /= NumS
        GDSecondOrderError[exp] /= (NumS * NumSet)
        GDTrueSecondOrderError[exp] /= (NumS * NumSet)
        GDOneSampleDifference[exp] /= (NumS * NumSet)
        # Start the accumulated curve from a copy of the averaged gradient variance;
        # it is converted into a running sum at the end of this loop body.
        GDAccGradientVarianceAve[exp] = copy.copy(GDGradientVarianceAve[exp])

        SGDFirstMoment[exp] /= (NumS * NumSet * NumRun)
        SGDSecondMoment[exp] /= (NumS * NumSet * NumRun)
        SGDTrainLossAve[exp] /= (NumS * NumRun)
        SGDTestLossAve[exp] /= (NumS * NumRun)
        SGDGradientVarianceAve[exp] /= (NumS * NumRun)
        SGDGradientNormAve[exp] /= (NumS * NumRun)
        SGDBootstrapLossAve[exp] /= (NumS * NumRun)
        SGDProductsAve[exp] /= (NumS * NumRun)
        SGDHessianFrobeniusesAve[exp] /= (NumS * NumRun)
        SGDHessianTracesAve[exp] /= (NumS * NumRun)
        SGDCovarianceTracesAve[exp] /= (NumS * NumRun)
        SGDAccProductsAve[exp] /= (NumS * NumRun)
        SGDAccGradientVarianceAve[exp] = copy.copy(SGDGradientVarianceAve[exp])

        SGDwRegFirstMoment[exp] /= (NumS * NumSet * NumRun)
        SGDwRegSecondMoment[exp] /= (NumS * NumSet * NumRun)
        SGDwRegTrainLossAve[exp] /= (NumS * NumRun)
        SGDwRegTestLossAve[exp] /= (NumS * NumRun)
        SGDwRegGradientVarianceAve[exp] /= (NumS * NumRun)
        SGDwRegGradientNormAve[exp] /= (NumS * NumRun)
        SGDwRegBootstrapLossAve[exp] /= (NumS * NumRun)
        SGDwRegProductsAve[exp] /= (NumS * NumRun)
        SGDwRegHessianFrobeniusesAve[exp] /= (NumS * NumRun)
        SGDwRegHessianTracesAve[exp] /= (NumS * NumRun)
        SGDwRegCovarianceTracesAve[exp] /= (NumS * NumRun)
        SGDwRegAccProductsAve[exp] /= (NumS * NumRun)
        SGDwRegAccGradientVarianceAve[exp] = copy.copy(SGDwRegGradientVarianceAve[exp])

        SGDwOrigiRegFirstMoment[exp] /= (NumS * NumSet * NumRun)
        SGDwOrigiRegSecondMoment[exp] /= (NumS * NumSet * NumRun)
        SGDwOrigiRegTrainLossAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegTestLossAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegGradientVarianceAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegGradientNormAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegBootstrapLossAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegProductsAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegHessianFrobeniusesAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegHessianTracesAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegCovarianceTracesAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegAccProductsAve[exp] /= (NumS * NumRun)
        SGDwOrigiRegAccGradientVarianceAve[exp] = copy.copy(SGDwOrigiRegGradientVarianceAve[exp])

        #AccFirstMoment[exp] = copy.copy(FirstMoment[exp])
        #AccFirstMoment[exp] = AccFirstMoment[exp] ** 2
        #AccSecondMoment[exp] = copy.copy(SecondMoment[exp])
        # In-place running sums: entry `ite` becomes the sum of the averaged
        # gradient variances from iteration 0 up to and including `ite`.
        for ite in range(GDites):
            if ite > 0:
                GDAccGradientVarianceAve[exp][ite] = GDAccGradientVarianceAve[exp][ite] + GDAccGradientVarianceAve[exp][ite - 1]
        for ite in range(SGDites):
            if ite > 0:
                SGDAccGradientVarianceAve[exp][ite] = SGDAccGradientVarianceAve[exp][ite] + SGDAccGradientVarianceAve[exp][ite - 1]
        for ite in range(SGDwRegites):
            if ite > 0:
                SGDwRegAccGradientVarianceAve[exp][ite] = SGDwRegAccGradientVarianceAve[exp][ite] + SGDwRegAccGradientVarianceAve[exp][ite - 1]
        for ite in range(SGDwOrigiRegites):
            if ite > 0:
                SGDwOrigiRegAccGradientVarianceAve[exp][ite] = SGDwOrigiRegAccGradientVarianceAve[exp][ite] + \
                                                          SGDwOrigiRegAccGradientVarianceAve[exp][ite - 1]



        #        AccFirstMoment[exp][ite] = AccFirstMoment[exp][ite] + AccFirstMoment[exp][ite - 1]
        #        AccSecondMoment[exp][ite] = AccSecondMoment[exp][ite] + AccSecondMoment[exp][ite - 1]
    #print("The Squared First Moment is {}".format(FirstMoment ** 2))
    #print("The Second Moment is {}".format(SecondMoment))
    #print("The Accumulated Second Moment is {}".format(AccSecondMoment))
    '''
    if PlotGraphs == True:
        #print(TestLosses)
        #GD plots
        plt.plot(range(GDites), GDFirstMoment[0] ** 2, color=Colors[0], linestyle='dotted', label='First Moments 1')
        plt.plot(range(GDites), GDSecondMoment[0], color=Colors[0], linestyle='dashed', label='Second Moments 1')
        for exp in range(1, NumExp):
            plt.plot(range(GDites), GDFirstMoment[exp] ** 2, color=Colors[exp], linestyle='dotted', label='First Moments {}'.format(str(exp+1)))
            plt.plot(range(GDites), GDSecondMoment[exp], color=Colors[exp], linestyle='dashed', label='Second Moments {}'.format(str(exp+1)))
        plt.legend()
        #plt.yscale('log')
        plt.title("First and Second Moments of GD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
        plt.xlabel("Iteration")
        #plt.ylabel("Train Gradient Variances")
        plt.savefig("GDMoments")
        plt.close()

        plt.plot(range(GDites), GDTrainLossAve[0], color=Colors[0], linestyle='solid', label='Train Loss Average 1')
        for exp in range(1, NumExp):
            plt.plot(range(GDites), GDTrainLossAve[exp], color=Colors[exp], linestyle='solid', label='Train Loss Average {}'.format(str(exp+1)))
        plt.legend()
        plt.yscale('log')
        plt.title("Average Train Losses of GD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
        plt.xlabel("Iteration")
        plt.ylabel("Train Loss")
        plt.savefig("GDTrainLosses")
        plt.close()

        plt.plot(range(GDites), GDTestLossAve[0], color=Colors[0], linestyle='solid', label='Test Loss Average 1')
        for exp in range(1, NumExp):
            plt.plot(range(GDites), GDTestLossAve[exp], color=Colors[exp], linestyle='solid', label='Test Loss Average {}'.format(str(exp+1)))
        plt.legend()
        plt.yscale('log')
        plt.title("Average Test Losses of GD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
        plt.xlabel("Iteration")
        plt.ylabel("Test Loss")
        plt.savefig("GDTestLosses")
        plt.close()

        plt.plot(range(GDites), GDTestLossAve[0] - GDTrainLossAve[0], color=Colors[0], linestyle='solid', label='Generalization Gap Average 1')
        for exp in range(1, NumExp):
            plt.plot(range(GDites), GDTestLossAve[exp] - GDTrainLossAve[exp], color=Colors[exp], linestyle='solid', label='Generalization Gap Average {}'.format(str(exp+1)))
        plt.legend()
        plt.yscale('log')
        plt.title("Average Generalization Gaps of GD (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
        plt.xlabel("Iteration")
        plt.ylabel("Generalization Gap")
        plt.savefig("GDGeneralizationGaps")
        plt.close()

        # Diagnostic figures for GD, SGD, SGDwReg and SGDwOrigiReg.
        #
        # Every figure follows the same recipe: one curve (or a pair of curves)
        # per experiment, coloured by experiment index, a shared title suffix
        # describing the problem size, and a file written via plt.savefig.
        # The helpers below capture that recipe once so that each figure is a
        # single call instead of a hand-copied ten-line stanza.
        #
        # NOTE(review): this relies on `plt` (matplotlib.pyplot) being in scope;
        # the import at the top of the file is commented out -- confirm it is
        # imported elsewhere before this point.

        def _save_figure(ites, curves, title, filename, ylabel=None, log_scale=True):
            """Draw the given curves for every experiment and save the figure.

            Args:
                ites: number of iterations; the x axis is ``range(ites)``.
                curves: sequence of ``(label, linestyle, series)`` triples.
                    ``series[exp]`` is plotted for each ``exp`` in
                    ``range(NumExp)`` and labelled ``"<label> <exp + 1>"``.
                title: text placed in front of the shared
                    ``(d=..., N=..., bs=..., 5-sparse beta)`` suffix.
                filename: target handed to ``plt.savefig``.
                ylabel: y-axis label, or ``None`` to leave the axis unlabelled.
                log_scale: when true, the y axis is switched to a log scale.
            """
            for exp in range(NumExp):
                for label, linestyle, series in curves:
                    plt.plot(range(ites), series[exp], color=Colors[exp], linestyle=linestyle,
                             label='{} {}'.format(label, str(exp + 1)))
            plt.legend()
            if log_scale:
                plt.yscale('log')
            plt.title("{} (d={}, N={}, bs={}, 5-sparse beta)".format(
                title, str(d), str(int(N*TrainRatio)), str(bs)))
            plt.xlabel("Iteration")
            if ylabel is not None:
                plt.ylabel(ylabel)
            plt.savefig(filename)
            plt.close()

        def _save_moments(name, ites, first_moment, second_moment):
            """Save the squared-first-moment (dotted) vs second-moment (dashed) figure."""
            # The legend says 'First Moments' although the plotted quantity is
            # the first moment squared; the label is kept as in the original.
            squared_first = [first_moment[exp] ** 2 for exp in range(NumExp)]
            _save_figure(
                ites,
                [('First Moments', 'dotted', squared_first),
                 ('Second Moments', 'dashed', second_moment)],
                "First and Second Moments of {}".format(name),
                "{}Moments".format(name),
                log_scale=False,
            )

        def _save_losses(name, ites, train_loss, test_loss):
            """Save the train-loss, test-loss and generalization-gap figures."""
            _save_figure(ites, [('Train Loss Average', 'solid', train_loss)],
                         "Average Train Losses of {}".format(name),
                         "{}TrainLosses".format(name), ylabel="Train Loss")
            _save_figure(ites, [('Test Loss Average', 'solid', test_loss)],
                         "Average Test Losses of {}".format(name),
                         "{}TestLosses".format(name), ylabel="Test Loss")
            # NOTE(review): test - train can be non-positive, and such values
            # cannot be represented on the log axis used here -- confirm that
            # losing those points is acceptable.
            gap = [test_loss[exp] - train_loss[exp] for exp in range(NumExp)]
            _save_figure(ites, [('Generalization Gap Average', 'solid', gap)],
                         "Average Generalization Gaps of {}".format(name),
                         "{}GeneralizationGaps".format(name), ylabel="Generalization Gap")

        # (file suffix, legend label, title stem, y-axis label) for the
        # log-scale diagnostics that every optimizer shares, in output order.
        _DIAGNOSTICS = (
            ('TrainLossVariance', 'Train Loss Variance Average',
             'Average Train Loss Variance', 'Train Loss Variance'),
            ('GradientVariances', 'Gradient Variance Average',
             'Average Gradient Variances', 'Gradient Variance'),
            ('GradientNorms', 'Gradient Norm Average',
             'Average Gradient Norms', 'Gradient Norm'),
            ('AccGradientVariances', 'Accumulated Gradient Variance Average',
             'Average Accumulated Gradient Variances', 'Accumulated Gradient Variance'),
            ('TraceProducts', 'Trace Product Average',
             'Average Trace Product', 'Trace Product'),
            ('HessianFrobeniuses', 'Hessian Frobenius Average',
             'Average Hessian Frobenius', 'Hessian Frobenius'),
            ('HessianTraces', 'Hessian Trace Average',
             'Average Hessian Trace', 'Hessian Trace'),
            ('CovarianceMatrixTraces', 'Covariance Matrix Trace Average',
             'Average Covariance Matrix Trace', 'Covariance Matrix Trace'),
            ('AccTraceProducts', 'Acc Trace Product Average',
             'Average Accumulated Trace Product', 'Accumulated Trace Product'),
        )

        def _save_diagnostics(name, ites, series_by_suffix):
            """Save one figure per entry of ``_DIAGNOSTICS``.

            ``series_by_suffix`` maps each file suffix in ``_DIAGNOSTICS`` to the
            per-experiment series to plot; a missing key raises ``KeyError``.
            """
            for suffix, label, title_stem, ylabel in _DIAGNOSTICS:
                _save_figure(ites, [(label, 'solid', series_by_suffix[suffix])],
                             "{} of {}".format(title_stem, name),
                             name + suffix, ylabel=ylabel)

        def _save_moment_ratio(name, ites, first_moment, second_moment):
            """Save the second-moment / squared-first-moment ratio figure (linear axis)."""
            ratio = [second_moment[exp] / first_moment[exp] ** 2 for exp in range(NumExp)]
            _save_figure(
                ites, [('Moment Ratio', 'solid', ratio)],
                "Ratio between the Second Moments and the Squared First Moments of {}".format(name),
                "{}MomentRatios".format(name), ylabel="Ratio", log_scale=False,
            )

        # GD plots (the moment/loss figures for GD are produced above this point)
        _save_diagnostics('GD', GDites, {
            'TrainLossVariance': GDBootstrapLossAve,
            'GradientVariances': GDGradientVarianceAve,
            'GradientNorms': GDGradientNormAve,
            'AccGradientVariances': GDAccGradientVarianceAve,
            'TraceProducts': GDProductsAve,
            'HessianFrobeniuses': GDHessianFrobeniusesAve,
            'HessianTraces': GDHessianTracesAve,
            'CovarianceMatrixTraces': GDCovarianceTracesAve,
            'AccTraceProducts': GDAccProductsAve,
        })
        print('GD second moment is {}'.format(GDSecondMoment[0]))
        print('GD squared first moment is {}'.format(GDFirstMoment[0] ** 2))
        print('Ratio is {}'.format(GDSecondMoment[0] / GDFirstMoment[0] ** 2))
        _save_moment_ratio('GD', GDites, GDFirstMoment, GDSecondMoment)

        # SGD plots
        _save_moments('SGD', SGDites, SGDFirstMoment, SGDSecondMoment)
        _save_losses('SGD', SGDites, SGDTrainLossAve, SGDTestLossAve)
        _save_diagnostics('SGD', SGDites, {
            'TrainLossVariance': SGDBootstrapLossAve,
            'GradientVariances': SGDGradientVarianceAve,
            'GradientNorms': SGDGradientNormAve,
            'AccGradientVariances': SGDAccGradientVarianceAve,
            'TraceProducts': SGDProductsAve,
            'HessianFrobeniuses': SGDHessianFrobeniusesAve,
            'HessianTraces': SGDHessianTracesAve,
            'CovarianceMatrixTraces': SGDCovarianceTracesAve,
            'AccTraceProducts': SGDAccProductsAve,
        })
        _save_moment_ratio('SGD', SGDites, SGDFirstMoment, SGDSecondMoment)

        # SGDwReg plots
        _save_moments('SGDwReg', SGDwRegites, SGDwRegFirstMoment, SGDwRegSecondMoment)
        _save_losses('SGDwReg', SGDwRegites, SGDwRegTrainLossAve, SGDwRegTestLossAve)
        _save_diagnostics('SGDwReg', SGDwRegites, {
            'TrainLossVariance': SGDwRegBootstrapLossAve,
            'GradientVariances': SGDwRegGradientVarianceAve,
            'GradientNorms': SGDwRegGradientNormAve,
            'AccGradientVariances': SGDwRegAccGradientVarianceAve,
            'TraceProducts': SGDwRegProductsAve,
            'HessianFrobeniuses': SGDwRegHessianFrobeniusesAve,
            'HessianTraces': SGDwRegHessianTracesAve,
            'CovarianceMatrixTraces': SGDwRegCovarianceTracesAve,
            'AccTraceProducts': SGDwRegAccProductsAve,
        })
        _save_moment_ratio('SGDwReg', SGDwRegites, SGDwRegFirstMoment, SGDwRegSecondMoment)

        # SGDwOrigiReg plots (its moment-ratio figure follows this block)
        _save_moments('SGDwOrigiReg', SGDwOrigiRegites,
                      SGDwOrigiRegFirstMoment, SGDwOrigiRegSecondMoment)
        _save_losses('SGDwOrigiReg', SGDwOrigiRegites,
                     SGDwOrigiRegTrainLossAve, SGDwOrigiRegTestLossAve)
        _save_diagnostics('SGDwOrigiReg', SGDwOrigiRegites, {
            'TrainLossVariance': SGDwOrigiRegBootstrapLossAve,
            'GradientVariances': SGDwOrigiRegGradientVarianceAve,
            'GradientNorms': SGDwOrigiRegGradientNormAve,
            'AccGradientVariances': SGDwOrigiRegAccGradientVarianceAve,
            'TraceProducts': SGDwOrigiRegProductsAve,
            'HessianFrobeniuses': SGDwOrigiRegHessianFrobeniusesAve,
            'HessianTraces': SGDwOrigiRegHessianTracesAve,
            'CovarianceMatrixTraces': SGDwOrigiRegCovarianceTracesAve,
            'AccTraceProducts': SGDwOrigiRegAccProductsAve,
        })

        plt.plot(range(SGDwOrigiRegites), SGDwOrigiRegSecondMoment[0] / SGDwOrigiRegFirstMoment[0] ** 2, color=Colors[0], linestyle='solid', label='Moment Ratio 1')
        for exp in range(1, NumExp):
            plt.plot(range(SGDwOrigiRegites), SGDwOrigiRegSecondMoment[exp] / SGDwOrigiRegFirstMoment[exp] ** 2, color=Colors[exp], linestyle='solid', label='Moment Ratio {}'.format(str(exp+1)))
        plt.legend()
        #plt.yscale('log')
        plt.title("Ratio between the Second Moments and the Squared First Moments of SGDwOrigiReg (d={}, N={}, bs={}, 5-sparse beta)".format(str(d),str(int(N*TrainRatio)),str(bs)))
        plt.xlabel("Iteration")
        plt.ylabel("Ratio")
        plt.savefig("SGDwOrigiRegMomentRatios")
        plt.close()
    '''

    print("Run time is {}".format(time.time() - starting_time))

    # Summary of the trailing window: for every optimizer and every experiment,
    # print the mean of each tracked series over its last `tail_len` entries.
    #
    # The window is taken as [-tail_len:] so the final entry is included; a
    # [-1000:-1] slice would stop one short and average only 999 values under
    # the "Last 1000" header.
    print("Last 1000")
    tail_len = 1000
    # One row per optimizer: printed label, training-loss series, testing-loss
    # series, then the remaining series in the order they are reported.
    # Every series is indexed by experiment number before being sliced.
    tail_rows = (
        ("GD", GDTrainLossAve, GDTestLossAve, GDSecondMoment,
         GDAccGradientVarianceAve, GDBootstrapLossAve, GDProductsAve,
         GDHessianFrobeniusesAve, GDHessianTracesAve,
         GDCovarianceTracesAve, GDAccProductsAve),
        ("SGD", SGDTrainLossAve, SGDTestLossAve, SGDSecondMoment,
         SGDAccGradientVarianceAve, SGDBootstrapLossAve, SGDProductsAve,
         SGDHessianFrobeniusesAve, SGDHessianTracesAve,
         SGDCovarianceTracesAve, SGDAccProductsAve),
        ("SGDwReg", SGDwRegTrainLossAve, SGDwRegTestLossAve, SGDwRegSecondMoment,
         SGDwRegAccGradientVarianceAve, SGDwRegBootstrapLossAve, SGDwRegProductsAve,
         SGDwRegHessianFrobeniusesAve, SGDwRegHessianTracesAve,
         SGDwRegCovarianceTracesAve, SGDwRegAccProductsAve),
        ("SGDwOrigiReg", SGDwOrigiRegTrainLossAve, SGDwOrigiRegTestLossAve, SGDwOrigiRegSecondMoment,
         SGDwOrigiRegAccGradientVarianceAve, SGDwOrigiRegBootstrapLossAve, SGDwOrigiRegProductsAve,
         SGDwOrigiRegHessianFrobeniusesAve, SGDwOrigiRegHessianTracesAve,
         SGDwOrigiRegCovarianceTracesAve, SGDwOrigiRegAccProductsAve),
    )
    for (opt_name, train_curve, test_curve, moment_curve, grad_var_curve,
         loss_var_curve, product_curve, hess_frob_curve, hess_trace_curve,
         cov_trace_curve, acc_product_curve) in tail_rows:
        for j in range(NumExp):
            # The two loss means feed both the gap and their own report lines.
            train_mean = train_curve[j][-tail_len:].mean()
            test_mean = test_curve[j][-tail_len:].mean()
            tail_lines = (
                ("Generalization Gap", test_mean - train_mean),
                ("Moment", moment_curve[j][-tail_len:].mean()),
                ("Accumulated Gradient Variance", grad_var_curve[j][-tail_len:].mean()),
                ("Training Loss Variance", loss_var_curve[j][-tail_len:].mean()),
                ("Training Loss", train_mean),
                ("Testing Loss", test_mean),
                ("Trace Product", product_curve[j][-tail_len:].mean()),
                ("Hessian Frobenius", hess_frob_curve[j][-tail_len:].mean()),
                ("Hessian Trace", hess_trace_curve[j][-tail_len:].mean()),
                ("Covariance Matrix Trace", cov_trace_curve[j][-tail_len:].mean()),
                ("Accumulated Trace Product", acc_product_curve[j][-tail_len:].mean()),
            )
            for metric_name, metric_value in tail_lines:
                print("{} {} {} is {}".format(opt_name, j + 1, metric_name, metric_value))


    # Summary of the trailing window: for every optimizer and every experiment,
    # print the mean of each tracked series over its last `tail_len` entries.
    # GD additionally reports three per-experiment values that are not windowed.
    #
    # The window is taken as [-tail_len:] so the final entry is included; a
    # [-200:-1] slice would stop one short and average only 199 values under
    # the "Last 200" header.
    print("Last 200")
    tail_len = 200
    # One row per optimizer: printed label, training-loss series, testing-loss
    # series, the remaining series in report order, and finally a tuple of
    # (label, per-experiment values) pairs printed as-is after the means.
    tail_rows = (
        ("GD", GDTrainLossAve, GDTestLossAve, GDSecondMoment,
         GDAccGradientVarianceAve, GDBootstrapLossAve, GDProductsAve,
         GDHessianFrobeniusesAve, GDHessianTracesAve,
         GDCovarianceTracesAve, GDAccProductsAve,
         (("Second Order Correction", GDSecondOrderError),
          ("True Second Order Correction", GDTrueSecondOrderError),
          ("Average One Sample Difference", GDOneSampleDifference))),
        ("SGD", SGDTrainLossAve, SGDTestLossAve, SGDSecondMoment,
         SGDAccGradientVarianceAve, SGDBootstrapLossAve, SGDProductsAve,
         SGDHessianFrobeniusesAve, SGDHessianTracesAve,
         SGDCovarianceTracesAve, SGDAccProductsAve,
         ()),
        ("SGDwReg", SGDwRegTrainLossAve, SGDwRegTestLossAve, SGDwRegSecondMoment,
         SGDwRegAccGradientVarianceAve, SGDwRegBootstrapLossAve, SGDwRegProductsAve,
         SGDwRegHessianFrobeniusesAve, SGDwRegHessianTracesAve,
         SGDwRegCovarianceTracesAve, SGDwRegAccProductsAve,
         ()),
        ("SGDwOrigiReg", SGDwOrigiRegTrainLossAve, SGDwOrigiRegTestLossAve, SGDwOrigiRegSecondMoment,
         SGDwOrigiRegAccGradientVarianceAve, SGDwOrigiRegBootstrapLossAve, SGDwOrigiRegProductsAve,
         SGDwOrigiRegHessianFrobeniusesAve, SGDwOrigiRegHessianTracesAve,
         SGDwOrigiRegCovarianceTracesAve, SGDwOrigiRegAccProductsAve,
         ()),
    )
    for (opt_name, train_curve, test_curve, moment_curve, grad_var_curve,
         loss_var_curve, product_curve, hess_frob_curve, hess_trace_curve,
         cov_trace_curve, acc_product_curve, per_exp_extras) in tail_rows:
        for j in range(NumExp):
            # The two loss means feed both the gap and their own report lines.
            train_mean = train_curve[j][-tail_len:].mean()
            test_mean = test_curve[j][-tail_len:].mean()
            tail_lines = (
                ("Generalization Gap", test_mean - train_mean),
                ("Moment", moment_curve[j][-tail_len:].mean()),
                ("Accumulated Gradient Variance", grad_var_curve[j][-tail_len:].mean()),
                ("Training Loss Variance", loss_var_curve[j][-tail_len:].mean()),
                ("Training Loss", train_mean),
                ("Testing Loss", test_mean),
                ("Trace Product", product_curve[j][-tail_len:].mean()),
                ("Hessian Frobenius", hess_frob_curve[j][-tail_len:].mean()),
                ("Hessian Trace", hess_trace_curve[j][-tail_len:].mean()),
                ("Covariance Matrix Trace", cov_trace_curve[j][-tail_len:].mean()),
                ("Accumulated Trace Product", acc_product_curve[j][-tail_len:].mean()),
            )
            for metric_name, metric_value in tail_lines:
                print("{} {} {} is {}".format(opt_name, j + 1, metric_name, metric_value))
            # Per-experiment values reported directly, without any windowing.
            for extra_name, extra_values in per_exp_extras:
                print("{} {} {} is {}".format(opt_name, j + 1, extra_name, extra_values[j]))
